Do an exploratory data analysis of a matrix of expression values. The data consists of expression values for samples that were treated with DMSO and TSA. The samples were measured using three technologies: bulk, IFC96, IFC800. See the two RDS files counts.RDS and phenodata.RDS
x = readRDS("counts.RDS")
anno = readRDS("phenodata.RDS")
head(anno)
## Treatment Technology
## bulk.DMSO-Pre-96IFC DMSO bulk
## bulk.TSA-Pre-96IFC TSA bulk
## bulk.DMSO-Pre-800IFC DMSO bulk
## bulk.TSA-Pre-800IFC TSA bulk
## bulk.DMSO-Post-800IFC DMSO bulk
## bulk.TSA-Post-800IFC TSA bulk
How many replicates are there for each combination of factor levels?
unique_tech<-unique(anno$Technology)
unique_treat<-unique(anno$Treatment)
row<-length(unique_tech)
col<-length(unique_treat)
frequency_table<-matrix(0,nrow = row, ncol = col)
rownames(frequency_table)<-unique_tech
colnames(frequency_table)<-unique_treat
n=length(anno$Treatment)
for (i in 1:n){
frequency_table[anno$Technology[i],anno$Treatment[i]]=frequency_table[anno$Technology[i],anno$Treatment[i]]+1
}
frequency_table
## DMSO TSA
## bulk 3 3
## IFC96 20 20
## IFC800 20 20
How many genes have an expression value above 0 in each sample? x>0 converts matrix to a matrix with same dimension but values of entries is 0 or 1 colSums: sum over the columns (sum over each samples). does sum over columns for all columns. it gives a row vector genes_above_zero: a vector that has as many as components as there are for columns
#x>0 converts matrix to a matrix with same dimension but values of entries is 0 or 1
#colSums: sum over the columns (sum over each samples). does sum over for a given column and you do that for all columns.
#colSums gives a row vector
#genes_above_zero: a vector that has as many as components as there are for columns
genes_above_zero = colSums(x > 0)
head(genes_above_zero)
## bulk.DMSO-Pre-96IFC bulk.TSA-Pre-96IFC bulk.DMSO-Pre-800IFC
## 15163 15261 15404
## bulk.TSA-Pre-800IFC bulk.DMSO-Post-800IFC bulk.TSA-Post-800IFC
## 16182 15202 16167
Scale the columns so that the total sum of all columns are identical
scale= centers and scales. centering: subtract the average from every column every column is 0 centered. didnt want to do this scale:should pass a vector which has same elements as there are columns. x has 86 columns. this vector should have 86 elements colSums(x) gives a vector contains as many cols as there are in cols in x. each column is going to be divided by sum of each columns
# Multiply by a million to better separate 0 from the other values
x_scaled = scale(x, center = FALSE, scale = colSums(x)*1e-6)
head(colSums(x_scaled))
## bulk.DMSO-Pre-96IFC bulk.TSA-Pre-96IFC bulk.DMSO-Pre-800IFC
## 1e+06 1e+06 1e+06
## bulk.TSA-Pre-800IFC bulk.DMSO-Post-800IFC bulk.TSA-Post-800IFC
## 1e+06 1e+06 1e+06
Use the function log1p to transform the data to log-scale log1p=log(1+x) to prevent crash of log0
x_log = log1p(x_scaled)
This is pre-processing step for violin plots and box plots
# prepare the data-frame and the colors for each plot
x_data_frame<-as.data.frame(x_log)
ncol=ncol(x_data_frame)
nrow=nrow(x_data_frame)
# x_color contains color information for each sample
x_color<-vector(mode="character", length=ncol)
for (i in 1:ncol){
treat<-anno[i,1]
tech<-anno[i,2]
if (treat=="DMSO"&& tech=="bulk"){
x_color[i]<-"DMSO+bulk"
}
else if (treat=="TSA"&& tech=="bulk"){
x_color[i]<-"TSA+bulk"
}
else if (treat=="DMSO"&& tech=="IFC96"){
x_color[i]<-"DMSO+IFC96"
}
else if (treat=="TSA"&& tech=="IFC96"){
x_color[i]<-"TSA+IFC96"
}
else if (treat=="DMSO"&& tech=="IFC800"){
x_color[i]<-"DMSO+IFC800"
}
else if (treat=="TSA"&& tech=="IFC800"){
x_color[i]<-"TSA+IFC800"
}
else{
x_color[i]<-"unknown"
}
}
Use violin plots to visualize the distribution of the expression values Group and color by experimental factors.
library(ggplot2)
#assign a new variable
flat_expressions<-x_log
#setting dimension of flat_X to NULL means it transforms to a single vector. (flattening the gene expression into one long vector)
dim(flat_expressions)<-NULL
# Create a vector of sample names, to be used as factor
# If you use "each" parameter it works like this:1 is repeated nrow times, 2 repeated nrow times,......6 repeated nrow times etc until 86
flat_names<-rep(1:ncol, each=nrow)
flat_colors<-rep(x_color, each=nrow)
# Create the flat data frame. put those three columns into one dataframe.
# x_flat_data_frame has three columns. 1st col=flat_names(sample name) 2nd col=flat_expression(gene expression)
# 3rd col=flat_colors(color)
x_flat_data_frame<-data.frame(flat_expressions, flat_names, flat_colors)
names(x_flat_data_frame)<-c("gene_expression", "sample_name", "color")
# Create a simple violin plot
# aes defines what you should have in x and y.
# x is sample name. you have to transform 1~86 to factor.
# take all y values that correspond to same x and make 1 violin.
p<-ggplot(x_flat_data_frame, aes(x=factor(x_flat_data_frame$sample_name), y=x_flat_data_frame$gene_expression, fill=x_flat_data_frame$color))
p+geom_violin()
To make violin plot more visible, I will just use first few samples.
flat_expressions_reduced<-x_log[,1:6]
dim(flat_expressions_reduced)<-NULL
flat_names_reduced<-rep(1:6, each=nrow)
flat_colors_reduced<-rep(x_color[1:6], each=nrow)
x_flat_df_reduced<-data.frame(flat_expressions_reduced, flat_names_reduced, flat_colors_reduced)
names(x_flat_df_reduced)<-c("gene_expression", "sample_name", "color")
p<-ggplot(x_flat_df_reduced, aes(x=factor(x_flat_df_reduced$sample_name), y=x_flat_df_reduced$gene_expression, fill=x_flat_df_reduced$color))
p+geom_violin()
use boxplots to visualize the distribution of the expression values Group and color by experimental factors.
# Prepare the data for the boxplot
#x_at for box plot. it tells me at which line exactly a particular box plot should be drawn.
#x_at is a vector 1~86
x_at = 1:ncol
x_names = paste("s", 1:ncol)
# Box plot
boxplot(x_data_frame,
main="gene expression",
at=x_at,
# at is defining position fo plot appears
xlab="normalized expression",
names=x_names,
las=2,
border=factor(x_color),
horizontal=TRUE)
To make box plot more visible, I will just use first few samples.
# Box plot
#x_data_frame[1:6] this selects first 6 columns(variables)
boxplot(x_data_frame[1:15],
main = "Gene expression",
#at is defining position fo plot appears
# 1st plot appeas 1st position, 2nd plot appears in 4th position......
at = c(1,4,2,5,3,6,8,9,7,11,10,12,14,13,15),
xlab = "normalized expression",
names = x_names[1:15],
las = 2,
border = factor(x_color[1:15]),
horizontal = TRUE
)
Identify the 500 most variable genes (with largest variance across samples) and continue working with those.
# Calculate the indexes of x, ordered according to the variance
# apply is a function that applies a certain function to either all columns or all rows of the dataframe.
# 1 means->apply to the rows
# 2 means_> apply to columns
# function->var is the function you want to apply
# calculates variance of each row
x_variance<-apply(x_data_frame, 1, var)
# order returns the vector of indices of ordered vector.
# na.last=T means if they are not assigned values they end up beind at the end
# decreasing=T from biggest to smallest
row_indexes<-order(x_variance, na.last=TRUE, decreasing=TRUE)
#head(row_indexes)
# Select the genes with the highest variance
# to select 500 points
x_top_500<-head(x_data_frame[row_indexes,],500)
head(x_top_500)
## bulk.DMSO-Pre-96IFC bulk.TSA-Pre-96IFC bulk.DMSO-Pre-800IFC
## EEF1A1 8.014777 7.437697 8.031036
## NUDT19 3.256398 3.483093 3.235704
## RRM2 6.235273 5.406863 6.200634
## TSPYL1 4.095684 5.013038 4.113957
## SLC35F2 4.369765 3.891466 4.268217
## RAP2B 4.615777 5.694864 4.761932
## bulk.TSA-Pre-800IFC bulk.DMSO-Post-800IFC bulk.TSA-Post-800IFC
## EEF1A1 7.437673 7.798348 7.568173
## NUDT19 3.043603 3.268930 2.914378
## RRM2 4.991755 6.153708 4.997477
## TSPYL1 4.896899 4.003042 4.901664
## SLC35F2 3.665405 4.199482 3.793348
## RAP2B 5.569699 4.700821 5.565621
## ifc96.LIB018588_GEN00046618_S78_L001
## EEF1A1 8.138783
## NUDT19 5.310767
## RRM2 0.000000
## TSPYL1 6.697922
## SLC35F2 5.776628
## RAP2B 7.298507
## ifc96.LIB018588_GEN00046619_S79_L001
## EEF1A1 7.937117
## NUDT19 6.191110
## RRM2 6.953878
## TSPYL1 6.963859
## SLC35F2 6.277592
## RAP2B 7.693593
## ifc96.LIB018588_GEN00046620_S80_L001
## EEF1A1 6.174591
## NUDT19 6.174591
## RRM2 0.000000
## TSPYL1 0.000000
## SLC35F2 0.000000
## RAP2B 0.000000
## ifc96.LIB018588_GEN00046621_S81_L001
## EEF1A1 8.171921
## NUDT19 6.380830
## RRM2 0.000000
## TSPYL1 6.800486
## SLC35F2 6.057429
## RAP2B 7.884581
## ifc96.LIB018588_GEN00046622_S82_L001
## EEF1A1 6.808619
## NUDT19 5.139416
## RRM2 0.000000
## TSPYL1 0.000000
## SLC35F2 0.000000
## RAP2B 4.052458
## ifc96.LIB018588_GEN00046623_S83_L001
## EEF1A1 7.916533
## NUDT19 7.164232
## RRM2 7.111143
## TSPYL1 6.374779
## SLC35F2 6.081471
## RAP2B 7.852169
## ifc96.LIB018588_GEN00046624_S84_L001
## EEF1A1 7.954351
## NUDT19 7.370651
## RRM2 5.408860
## TSPYL1 8.344757
## SLC35F2 6.260206
## RAP2B 7.549062
## ifc96.LIB018588_GEN00046625_S85_L001
## EEF1A1 7.891281
## NUDT19 6.327853
## RRM2 0.000000
## TSPYL1 6.594290
## SLC35F2 5.986242
## RAP2B 7.553178
## ifc96.LIB018588_GEN00046626_S86_L001
## EEF1A1 0.000000
## NUDT19 0.000000
## RRM2 3.193615
## TSPYL1 0.000000
## SLC35F2 0.000000
## RAP2B 3.369076
## ifc96.LIB018588_GEN00046627_S87_L001
## EEF1A1 7.771700
## NUDT19 7.026673
## RRM2 0.000000
## TSPYL1 6.928288
## SLC35F2 6.739637
## RAP2B 6.980345
## ifc96.LIB018588_GEN00046628_S88_L001
## EEF1A1 4.958238
## NUDT19 5.361359
## RRM2 0.000000
## TSPYL1 4.272091
## SLC35F2 0.000000
## RAP2B 4.958238
## ifc96.LIB018588_GEN00046629_S89_L001
## EEF1A1 7.952244
## NUDT19 5.888708
## RRM2 0.000000
## TSPYL1 5.391508
## SLC35F2 5.601949
## RAP2B 7.149635
## ifc96.LIB018588_GEN00046630_S90_L001
## EEF1A1 7.914671
## NUDT19 6.728051
## RRM2 4.249541
## TSPYL1 6.865012
## SLC35F2 6.596617
## RAP2B 7.071628
## ifc96.LIB018588_GEN00046631_S91_L001
## EEF1A1 8.316330
## NUDT19 6.631053
## RRM2 3.377311
## TSPYL1 6.519324
## SLC35F2 6.283606
## RAP2B 7.748913
## ifc96.LIB018588_GEN00046632_S92_L001
## EEF1A1 7.654020
## NUDT19 6.544538
## RRM2 4.283393
## TSPYL1 6.497287
## SLC35F2 6.193856
## RAP2B 7.597825
## ifc96.LIB018588_GEN00046633_S93_L001
## EEF1A1 8.026738
## NUDT19 6.200969
## RRM2 0.000000
## TSPYL1 6.156904
## SLC35F2 6.574355
## RAP2B 7.836884
## ifc96.LIB018588_GEN00046634_S94_L001
## EEF1A1 7.860224
## NUDT19 6.424661
## RRM2 5.794096
## TSPYL1 6.346750
## SLC35F2 6.233632
## RAP2B 7.279995
## ifc96.LIB018588_GEN00046635_S95_L001
## EEF1A1 7.977083
## NUDT19 6.193714
## RRM2 6.503324
## TSPYL1 6.623600
## SLC35F2 6.141573
## RAP2B 7.150759
## ifc96.LIB018588_GEN00046636_S96_L001
## EEF1A1 7.249692
## NUDT19 6.820064
## RRM2 0.000000
## TSPYL1 6.494089
## SLC35F2 7.061093
## RAP2B 7.720114
## ifc96.LIB018589_GEN00046637_S97_L002
## EEF1A1 8.381627
## NUDT19 6.035540
## RRM2 5.828453
## TSPYL1 5.488037
## SLC35F2 5.964808
## RAP2B 6.357653
## ifc96.LIB018589_GEN00046638_S98_L002
## EEF1A1 8.110863
## NUDT19 6.879199
## RRM2 6.956217
## TSPYL1 6.284805
## SLC35F2 6.265127
## RAP2B 6.585469
## ifc96.LIB018589_GEN00046639_S99_L002
## EEF1A1 8.105260
## NUDT19 7.041037
## RRM2 6.095102
## TSPYL1 6.409140
## SLC35F2 6.178175
## RAP2B 6.331185
## ifc96.LIB018589_GEN00046640_S100_L002
## EEF1A1 8.535092
## NUDT19 4.742636
## RRM2 6.685059
## TSPYL1 5.431416
## SLC35F2 6.269156
## RAP2B 6.420866
## ifc96.LIB018589_GEN00046641_S101_L002
## EEF1A1 8.147820
## NUDT19 7.263454
## RRM2 6.921044
## TSPYL1 5.879259
## SLC35F2 6.109100
## RAP2B 6.965006
## ifc96.LIB018589_GEN00046642_S102_L002
## EEF1A1 7.997022
## NUDT19 6.930602
## RRM2 7.006324
## TSPYL1 5.681586
## SLC35F2 6.590318
## RAP2B 6.582921
## ifc96.LIB018589_GEN00046643_S103_L002
## EEF1A1 8.283312
## NUDT19 6.029990
## RRM2 6.390274
## TSPYL1 5.324717
## SLC35F2 6.472512
## RAP2B 6.640935
## ifc96.LIB018589_GEN00046644_S104_L002
## EEF1A1 8.290907
## NUDT19 6.127205
## RRM2 0.000000
## TSPYL1 6.053837
## SLC35F2 0.000000
## RAP2B 7.016507
## ifc96.LIB018589_GEN00046645_S105_L002
## EEF1A1 8.344282
## NUDT19 7.234195
## RRM2 6.876042
## TSPYL1 5.750232
## SLC35F2 6.770949
## RAP2B 6.251682
## ifc96.LIB018589_GEN00046646_S106_L002
## EEF1A1 8.389450
## NUDT19 5.750171
## RRM2 0.000000
## TSPYL1 6.123174
## SLC35F2 6.368597
## RAP2B 6.607249
## ifc96.LIB018589_GEN00046647_S107_L002
## EEF1A1 8.353071
## NUDT19 6.916726
## RRM2 1.022788
## TSPYL1 6.347214
## SLC35F2 5.456614
## RAP2B 6.628197
## ifc96.LIB018589_GEN00046648_S108_L002
## EEF1A1 8.589233
## NUDT19 6.547093
## RRM2 0.000000
## TSPYL1 6.071801
## SLC35F2 6.560633
## RAP2B 7.404233
## ifc96.LIB018589_GEN00046649_S109_L002
## EEF1A1 8.195026
## NUDT19 5.048757
## RRM2 7.864826
## TSPYL1 5.807469
## SLC35F2 6.575191
## RAP2B 5.358975
## ifc96.LIB018589_GEN00046650_S110_L002
## EEF1A1 7.954024
## NUDT19 6.398260
## RRM2 0.000000
## TSPYL1 5.788114
## SLC35F2 6.752006
## RAP2B 6.582323
## ifc96.LIB018589_GEN00046651_S111_L002
## EEF1A1 8.5920617
## NUDT19 6.4603136
## RRM2 0.7674101
## TSPYL1 5.5087417
## SLC35F2 6.4870358
## RAP2B 6.5384287
## ifc96.LIB018589_GEN00046652_S112_L002
## EEF1A1 8.320613
## NUDT19 6.767691
## RRM2 7.059247
## TSPYL1 5.799263
## SLC35F2 6.664124
## RAP2B 5.707324
## ifc96.LIB018589_GEN00046653_S113_L002
## EEF1A1 8.484218
## NUDT19 5.719387
## RRM2 3.452177
## TSPYL1 5.437099
## SLC35F2 5.952183
## RAP2B 6.705031
## ifc96.LIB018589_GEN00046654_S114_L002
## EEF1A1 8.324573
## NUDT19 6.037365
## RRM2 0.000000
## TSPYL1 5.218118
## SLC35F2 6.558652
## RAP2B 7.040520
## ifc96.LIB018589_GEN00046655_S115_L002
## EEF1A1 8.217502
## NUDT19 5.096100
## RRM2 6.581880
## TSPYL1 5.282592
## SLC35F2 5.921842
## RAP2B 6.359082
## ifc96.LIB018589_GEN00046656_S116_L002
## EEF1A1 8.933926
## NUDT19 5.026772
## RRM2 1.944360
## TSPYL1 4.835620
## SLC35F2 5.702456
## RAP2B 6.333502
## ifc96.LIB018588_GEN00046541_S1_L001
## EEF1A1 7.969540
## NUDT19 6.288428
## RRM2 6.517274
## TSPYL1 6.626784
## SLC35F2 6.438923
## RAP2B 8.068990
## ifc800.COL03_ROW01_LIB018723-GEN00047367
## EEF1A1 0.000000
## NUDT19 0.000000
## RRM2 6.002152
## TSPYL1 0.000000
## SLC35F2 2.906037
## RAP2B 3.721583
## ifc800.COL03_ROW02_LIB018723-GEN00047367
## EEF1A1 0.000000
## NUDT19 0.000000
## RRM2 5.530829
## TSPYL1 2.180958
## SLC35F2 3.478748
## RAP2B 4.635817
## ifc800.COL03_ROW03_LIB018723-GEN00047367
## EEF1A1 0.000000
## NUDT19 0.000000
## RRM2 6.863288
## TSPYL1 4.833351
## SLC35F2 0.000000
## RAP2B 0.000000
## ifc800.COL03_ROW04_LIB018723-GEN00047367
## EEF1A1 2.700594
## NUDT19 0.000000
## RRM2 6.378340
## TSPYL1 0.000000
## SLC35F2 0.000000
## RAP2B 0.000000
## ifc800.COL03_ROW05_LIB018723-GEN00047367
## EEF1A1 1.842936
## NUDT19 0.000000
## RRM2 5.963576
## TSPYL1 1.842936
## SLC35F2 0.000000
## RAP2B 0.000000
## ifc800.COL03_ROW06_LIB018723-GEN00047367
## EEF1A1 0.000000
## NUDT19 4.455511
## RRM2 6.842792
## TSPYL1 0.000000
## SLC35F2 0.000000
## RAP2B 0.000000
## ifc800.COL03_ROW07_LIB018723-GEN00047367
## EEF1A1 0.000000
## NUDT19 0.000000
## RRM2 6.111146
## TSPYL1 3.572464
## SLC35F2 0.000000
## RAP2B 3.356318
## ifc800.COL03_ROW08_LIB018723-GEN00047367
## EEF1A1 2.390713
## NUDT19 0.000000
## RRM2 7.099528
## TSPYL1 0.000000
## SLC35F2 3.924078
## RAP2B 2.390713
## ifc800.COL03_ROW09_LIB018723-GEN00047367
## EEF1A1 0.000000
## NUDT19 0.000000
## RRM2 6.313580
## TSPYL1 0.000000
## SLC35F2 0.000000
## RAP2B 1.808619
## ifc800.COL03_ROW10_LIB018723-GEN00047367
## EEF1A1 0.000000
## NUDT19 0.000000
## RRM2 6.357654
## TSPYL1 0.000000
## SLC35F2 0.000000
## RAP2B 2.010987
## ifc800.COL03_ROW11_LIB018723-GEN00047367
## EEF1A1 0.000000
## NUDT19 0.000000
## RRM2 4.468720
## TSPYL1 0.000000
## SLC35F2 2.466456
## RAP2B 0.000000
## ifc800.COL03_ROW12_LIB018723-GEN00047367
## EEF1A1 4.321119
## NUDT19 0.000000
## RRM2 5.932262
## TSPYL1 0.000000
## SLC35F2 3.511377
## RAP2B 2.973906
## ifc800.COL03_ROW13_LIB018723-GEN00047367
## EEF1A1 0.000000
## NUDT19 0.000000
## RRM2 6.179237
## TSPYL1 0.000000
## SLC35F2 0.000000
## RAP2B 4.931641
## ifc800.COL03_ROW14_LIB018723-GEN00047367
## EEF1A1 3.698217
## NUDT19 0.000000
## RRM2 7.041309
## TSPYL1 0.000000
## SLC35F2 0.000000
## RAP2B 2.647952
## ifc800.COL03_ROW15_LIB018723-GEN00047367
## EEF1A1 2.511544
## NUDT19 0.000000
## RRM2 6.888084
## TSPYL1 3.950430
## SLC35F2 0.000000
## RAP2B 0.000000
## ifc800.COL03_ROW16_LIB018723-GEN00047367
## EEF1A1 0.000000
## NUDT19 0.000000
## RRM2 5.350289
## TSPYL1 0.000000
## SLC35F2 2.916280
## RAP2B 0.000000
## ifc800.COL03_ROW17_LIB018723-GEN00047367
## EEF1A1 1.977966
## NUDT19 0.000000
## RRM2 5.982625
## TSPYL1 2.979810
## SLC35F2 1.414399
## RAP2B 1.414399
## ifc800.COL03_ROW18_LIB018723-GEN00047367
## EEF1A1 0.000000
## NUDT19 0.000000
## RRM2 6.432380
## TSPYL1 0.000000
## SLC35F2 3.791014
## RAP2B 0.000000
## ifc800.COL03_ROW19_LIB018723-GEN00047367
## EEF1A1 0.000000
## NUDT19 0.000000
## RRM2 4.706882
## TSPYL1 0.000000
## SLC35F2 0.000000
## RAP2B 0.000000
## ifc800.COL03_ROW20_LIB018723-GEN00047367
## EEF1A1 3.445719
## NUDT19 2.783956
## RRM2 6.561039
## TSPYL1 0.000000
## SLC35F2 0.000000
## RAP2B 0.000000
## ifc800.COL11_ROW01_LIB018723-GEN00047362
## EEF1A1 0.000000
## NUDT19 0.000000
## RRM2 5.688547
## TSPYL1 5.688547
## SLC35F2 0.000000
## RAP2B 0.000000
## ifc800.COL11_ROW02_LIB018723-GEN00047362
## EEF1A1 0
## NUDT19 0
## RRM2 0
## TSPYL1 0
## SLC35F2 0
## RAP2B 0
## ifc800.COL11_ROW03_LIB018723-GEN00047362
## EEF1A1 2.913392
## NUDT19 0.000000
## RRM2 0.000000
## TSPYL1 3.579019
## SLC35F2 4.478426
## RAP2B 0.000000
## ifc800.COL11_ROW04_LIB018723-GEN00047362
## EEF1A1 0.00000
## NUDT19 0.00000
## RRM2 0.00000
## TSPYL1 5.55451
## SLC35F2 0.00000
## RAP2B 0.00000
## ifc800.COL11_ROW05_LIB018723-GEN00047362
## EEF1A1 0.000000
## NUDT19 0.000000
## RRM2 5.449964
## TSPYL1 0.000000
## SLC35F2 0.000000
## RAP2B 0.000000
## ifc800.COL11_ROW06_LIB018723-GEN00047362
## EEF1A1 0
## NUDT19 0
## RRM2 0
## TSPYL1 0
## SLC35F2 0
## RAP2B 0
## ifc800.COL11_ROW07_LIB018723-GEN00047362
## EEF1A1 4.285605
## NUDT19 1.562528
## RRM2 6.234542
## TSPYL1 2.144964
## SLC35F2 2.510623
## RAP2B 2.777792
## ifc800.COL11_ROW08_LIB018723-GEN00047362
## EEF1A1 3.704843
## NUDT19 4.687710
## RRM2 5.322433
## TSPYL1 0.000000
## SLC35F2 0.000000
## RAP2B 1.896627
## ifc800.COL11_ROW09_LIB018723-GEN00047362
## EEF1A1 4.080012
## NUDT19 1.765674
## RRM2 5.930990
## TSPYL1 0.000000
## SLC35F2 0.000000
## RAP2B 3.797949
## ifc800.COL11_ROW10_LIB018723-GEN00047362
## EEF1A1 3.294549
## NUDT19 0.000000
## RRM2 0.000000
## TSPYL1 0.000000
## SLC35F2 3.572917
## RAP2B 2.907457
## ifc800.COL11_ROW11_LIB018723-GEN00047362
## EEF1A1 4.330370
## NUDT19 0.000000
## RRM2 6.653784
## TSPYL1 0.000000
## SLC35F2 4.330370
## RAP2B 5.170117
## ifc800.COL11_ROW12_LIB018723-GEN00047362
## EEF1A1 4.557685
## NUDT19 3.527150
## RRM2 4.273492
## TSPYL1 1.422003
## SLC35F2 1.422003
## RAP2B 3.479829
## ifc800.COL11_ROW13_LIB018723-GEN00047362
## EEF1A1 4.068970
## NUDT19 3.044468
## RRM2 6.241193
## TSPYL1 0.000000
## SLC35F2 0.000000
## RAP2B 4.590000
## ifc800.COL11_ROW14_LIB018723-GEN00047362
## EEF1A1 3.585646
## NUDT19 0.000000
## RRM2 0.000000
## TSPYL1 4.485165
## SLC35F2 0.000000
## RAP2B 4.133311
## ifc800.COL11_ROW15_LIB018723-GEN00047362
## EEF1A1 4.755991
## NUDT19 3.317918
## RRM2 0.000000
## TSPYL1 0.000000
## SLC35F2 3.814146
## RAP2B 3.317918
## ifc800.COL11_ROW16_LIB018723-GEN00047362
## EEF1A1 0
## NUDT19 0
## RRM2 0
## TSPYL1 0
## SLC35F2 0
## RAP2B 0
## ifc800.COL11_ROW17_LIB018723-GEN00047362
## EEF1A1 3.963772
## NUDT19 0.000000
## RRM2 3.289438
## TSPYL1 0.000000
## SLC35F2 0.000000
## RAP2B 4.515215
## ifc800.COL11_ROW18_LIB018723-GEN00047362
## EEF1A1 5.280874
## NUDT19 0.000000
## RRM2 5.031013
## TSPYL1 0.000000
## SLC35F2 0.000000
## RAP2B 3.514237
## ifc800.COL11_ROW19_LIB018723-GEN00047362
## EEF1A1 4.909205
## NUDT19 2.848511
## RRM2 1.692879
## TSPYL1 0.000000
## SLC35F2 2.930683
## RAP2B 3.204935
## ifc800.COL11_ROW20_LIB018723-GEN00047362
## EEF1A1 4.854672
## NUDT19 0.000000
## RRM2 5.177928
## TSPYL1 3.911551
## SLC35F2 0.000000
## RAP2B 3.238215
Compute and visualize the sample-to-sample correlations
#prepare the data
x_top_500_corr<-x_top_500
#name sample so that it can output easily
#names is gonna name variables (in the dataframe it is column)
names(x_top_500_corr)<-1:ncol
# Compute the correlation between columns
#cor calculates correlation between variables of the data frame.(upper traingle)
correlation<-cor(x_top_500_corr)
# Visualize it using corrplot
# install.packages("corrplot")
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.6.3
## corrplot 0.84 loaded
# plot correlation. used default setting
# sample against sample
# sample is ordered by hclust
corrplot(correlation, type="upper", order="hclust", tl.col="black", tl.srt=45)
Compute and visualize a hierarchical clustering of the samples, use package hclust
#prepare data
x_top_500_cluster<-x_top_500
#hclust uses dissimilarity structure.
#dist(t(x_top_500)) t is transposition. dist computes distance between rows. you want distance between samples. so had to transpose so that samples appears in the rows.
#p is power p=2 is euclidean
names(x_top_500_cluster)<-1:ncol
#compute distance matrix
dissimilarity_matrix<-dist(t(x_top_500_cluster), method="euclidean", diag=FALSE, upper=FALSE, p=2)
# Create and plot the cluster
cluster<-hclust(dissimilarity_matrix, method="complete", members=NULL)
plot(cluster)
Use the package pheatmap to generate a heatmap of the expression data.
# install.packages(pheatmap)
library(pheatmap)
## Warning: package 'pheatmap' was built under R version 3.6.3
#prepare the data
x_top_500_heat<-x_top_500
#names know that it will give names to columns(variables)
names(x_top_500_heat)<-1:ncol
row.names(x_top_500_heat)<-1:500
#computes heat map of dataframe (relationship between different samples) this shows hclust and heatmap
#x-axis is sample and y-axis gene (observation).
pheatmap(x_top_500_heat)
In the exercise session, we saw a potential case where the normal PCA can be misleading.
#constants to control the points
num_points=200
set.seed(6)
# generate the center points with 3 dimensions
# runif generates a vector of random numbers between 0 and 1
center_angles<-runif(num_points)*2*pi
center_x<- 5* runif(num_points)*cos(center_angles)
center_y<-3* runif(num_points)*sin(center_angles)
center_z<-1* runif(num_points)
center_class<-rep(1,times=num_points)
# generate the border points with 3 dimensions
border_angles<-runif(num_points)*2*pi
border_x<-(10+5*runif(num_points))*cos(border_angles)
border_y<-(7+3*runif(num_points))*sin(border_angles)
border_z<-(4+1*runif(num_points))*cos(border_angles)
border_class<-rep(2, times=num_points)
# generate the full data frame.
# pca_data have 3 columns. first half of 2nd column is center_class_x, 2nd half of 2nd column is border class_x
data<-data.frame(c(center_x, border_x), c(center_y, border_y), c(center_z, border_z), c(center_class, border_class))
names(data)<-c("x","y","z", "class")
# Plot the data, coloring them based on the class
# color is a list of 2elments but indexed by by class column of data.
plot(data$x,data$y, col=c("red","blue")[data$class])
* Do the PCA, plot the variance explained by the principal components. Select \(k\) such that you explain \(80\%\) of the variance in your data.
#compute and view the pareto plot of the pca
pca<-prcomp(data, center=TRUE, scale=TRUE)
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.3984 1.0159 0.9852 0.2050
## Proportion of Variance 0.4889 0.2580 0.2426 0.0105
## Cumulative Proportion 0.4889 0.7469 0.9895 1.0000
#calculates standard deviation associated every single principal component
#pca.var contains variance explained by each principal component
pca.var<-pca$sdev^2
qcc::pareto.chart(pca.var)
##
## Pareto chart analysis for pca.var
## Frequency Cum.Freq. Percentage Cum.Percent.
## A 1.95543374 1.95543374 48.88584354 48.88584354
## B 1.03200555 2.98743929 25.80013882 74.68598236
## C 0.97054084 3.95798013 24.26352100 98.94950336
## D 0.04201987 4.00000000 1.05049664 100.00000000
#in this case you need to choose 3 out of 4 principal components to explain over 80%
Select \(k\) such that you explain \(80\%\) of the variance in your data.
#select k=3 to explain at least 80% of the variance
k=3
# projection onto the lower dimension(=k dimension)
# scale function does "mean centering" and "multiplies each column by certain scale"
# you scale data such that pca$center is the mean of each column and every column is multiplied by this pca$scale
# pca$rotation are eigen vectors and then you selected first k columns (k eigen vectors)
# %*% makes matrix multiplication.
# data is the original data.this is not centered and scaled.
# but pca is done in mean centered and scaled data. so you need to scale data again
projected_points<- as.data.frame (scale(data, pca$center, pca$scale)%*%pca$rotation[,1:k])
#add additional column (class)
projected_points$class<-data$class
# v1 projection of all the points along the first principal components
names(projected_points)<-c("v1", "v2", "v3", "class")
head(projected_points)
## v1 v2 v3 class
## 1 -0.04417439 -1.0328398 -0.3740435 1
## 2 -0.52765379 -0.7389911 -0.5756853 1
## 3 -0.05238098 -0.6646648 -0.7585589 1
## 4 -0.07567400 -0.6099183 -0.8473754 1
## 5 -0.04369685 -0.9555762 -0.4158375 1
## 6 -0.14511580 -0.7979205 -0.5896901 1
#col=c("red","green")[pca_projected$class] if class=1 then this outputs red. if class=2 outputs green
plot(projected_points[,1], projected_points[,2], col=c("red", "blue")[projected_points$class])
problem: Before pca data was well separated by class,but after pca, the data is less separable even if you used prinipal components that explained above 80% variance (98%).This problem happened because this data is not linearly separable but pca works in the data that is linearly separable. So we use kernel to tackle this issue
#pca works in the data that is linearly separable. So use kernel
#data is original data. using 3rd column (z axis), 4th column(class). you are not changing these two columns
kernelized_data<-as.data.frame(data[c(3:4)])
#generated two columns ( r and a )
#r= reduced (distance of each point from origin)
kernelized_data$r<-(data$x^2+data$y^2)^0.5
#a=angle of points (angle or rotation between x-axis and points)
#atan=arc tan=inverse of tangent=gives you this angle
kernelized_data$a<-atan(data$y/data$x)
head(kernelized_data)
## z class r a
## 1 0.66135785 1 1.8439048 1.19259807
## 2 0.30567014 1 4.5972491 -0.01924905
## 3 0.48402936 1 0.7119307 -1.22399216
## 4 0.93482881 1 1.7824262 -0.68722690
## 5 0.09972553 1 1.6322270 -0.96448391
## 6 0.39079038 1 0.9642127 -0.29105009
#you have transformed data (kernelized data) into non-linear space so that when you use pca later it is linearly separable.
plot(kernelized_data$r, kernelized_data$a, col=c("red", "blue")[kernelized_data$class])
kernel_pca<-prcomp(kernelized_data, center=TRUE, scale=TRUE)
kernel_pca_var<-kernel_pca$sdev^2
qcc::pareto.chart(kernel_pca_var)
##
## Pareto chart analysis for kernel_pca_var
## Frequency Cum.Freq. Percentage Cum.Percent.
## A 1.95572526 1.95572526 48.89313151 48.89313151
## B 1.01012551 2.96585077 25.25313777 74.14626927
## C 0.98010736 3.94595813 24.50268390 98.64895318
## D 0.05404187 4.00000000 1.35104682 100.00000000
#select k=3 to explain at least 80% of the variance
k=3
projected_kernel_points<-as.data.frame(scale(kernelized_data, center=kernel_pca$center,scale=kernel_pca$scale)%*%kernel_pca$rotation[,1:k])
projected_kernel_points$class<-kernelized_data$class
names(projected_kernel_points)<-c("V1", "V2", "V3", "class")
plot(projected_kernel_points$V1, projected_kernel_points$V2, col=c("red", "blue")[projected_kernel_points$class])
In this case no. Both methods had to use 3 out of 4 PCs to explain at least 80%. But even in this case, kernel pca can separate points according to class with k=1 or k=2. (example shown below) Also in my opinion using kernel pca can lead to smaller k.
#select k=1 to show kernel pca can separate points using even 1 pc
k=1
projected_kernel_points_reduced<-as.data.frame(scale(kernelized_data, center=kernel_pca$center,scale=kernel_pca$scale)%*%kernel_pca$rotation[,1:k])
projected_kernel_points_reduced$class<-kernelized_data$class
names(projected_kernel_points_reduced)<-c("V1", "class")
#1:400 is an artificial axis. the only PC is y axis
plot(1:400, projected_kernel_points_reduced$V1, col=c("red", "blue")[projected_kernel_points$class])